in this document I creat a new version for the flowcam classifier at 18 degrees in december 2021. There are new classes: 1 for small unidentified particles smaller than 10, which might be small digested algae and 1 for dividing chlamydomonas. I also cleaned the species libraries by eliminating particles that were outliers in any variables. This eliminated about 55 of the data

Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)

set.seed(1)

Data

colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

dd_all <- read_csv("Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)

table(dd_all$species)

filtering out outliers

dd_all$id <- 1:nrow(dd_all)
dd_all_long <- dd_all %>%
  dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
                Average_Green,Average_Red,Circle_Fit,Circularity,
                Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
                Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
                Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
                Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
                Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
                Symmetry,Transparency,Width, id) %>%
  pivot_longer(cols=-c(id,species), names_to="variable") %>%
  dplyr::group_by(variable,species) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))



outliers <- dd_all_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>6)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
## 
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                    15                    42                     9 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                    39                    11                    31 
##             Cosmarium           Cryptomonas                Debris 
##                    20                    37                    10 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                    73                    18                     5 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                    13                    18                    83 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                     1                   219                    29 
##          Staurastrum2           Tetrahymena 
##                     3                     9
nrow(outliers)/nrow(dd_all)
## [1] 0.05332399
dd_all_filtered <- dd_all %>%
  dplyr::filter(!is.element(id,outliers$id))

dd_all$removed_outliers <- F
dd_all_filtered$removed_outliers <- T

dd_all_comparison <- rbind(dd_all,dd_all_filtered)


dd_all_comparison %>% 
  ggplot(aes(log10(Area_ABD), col=removed_outliers))+
  geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
  geom_boxplot(outlier.alpha = 0.3) +
  theme_bw() +
  facet_wrap(~species, scales = "free_y")  +
  geom_vline(xintercept = 1, col="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dd_all_filtered %>%
  ggplot(aes(log10(Area_ABD)))+
  geom_density(aes(col=species))

Species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                                         "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(62)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()

trainingdata <- dd_all[complete.cases(dd_all), ]


for(i in 1:length(compositions_list)){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
  }

  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)
  
  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_trained_december2021.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))